Table des matières

library("ggplot2")
library("plotly")
## 
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     last_plot
## L'objet suivant est masqué depuis 'package:stats':
## 
##     filter
## L'objet suivant est masqué depuis 'package:graphics':
## 
##     layout
library("caret")
## Le chargement a nécessité le package : lattice
# Load the bank marketing dataset (semicolon-separated CSV) and preview it
# as a tibble for compact printing.
bank_data <- read.csv("bank.csv", sep = ";")
dplyr::as_tibble(bank_data)
## # A tibble: 4,521 × 17
##      age job   marital educa…¹ default balance housing loan  contact   day month
##    <int> <chr> <chr>   <chr>   <chr>     <int> <chr>   <chr> <chr>   <int> <chr>
##  1    30 unem… married primary no         1787 no      no    cellul…    19 oct  
##  2    33 serv… married second… no         4789 yes     yes   cellul…    11 may  
##  3    35 mana… single  tertia… no         1350 yes     no    cellul…    16 apr  
##  4    30 mana… married tertia… no         1476 yes     yes   unknown     3 jun  
##  5    59 blue… married second… no            0 yes     no    unknown     5 may  
##  6    35 mana… single  tertia… no          747 no      no    cellul…    23 feb  
##  7    36 self… married tertia… no          307 yes     no    cellul…    14 may  
##  8    39 tech… married second… no          147 yes     no    cellul…     6 may  
##  9    41 entr… married tertia… no          221 yes     no    unknown    14 may  
## 10    43 serv… married primary no          -88 yes     yes   cellul…    17 apr  
## # … with 4,511 more rows, 6 more variables: duration <int>, campaign <int>,
## #   pdays <int>, previous <int>, poutcome <chr>, y <chr>, and abbreviated
## #   variable name ¹​education

Recherche d’influence de quelques variables explicatives sur la variable y (souscription à un service bancaire)

# Boxplot: call duration (duration) split by subscription outcome (y).
# Fix: the displayed title read "de le variable", which is grammatically
# incorrect French — corrected to "de la variable".
graph1 <- ggplot(bank_data, aes(x = y, y = duration, fill = y)) +
  geom_boxplot()
graph1 <- graph1 +
  ggtitle("L'influence de la variable duration (durée de l'appel) sur la variable souscription y")
# graph1 <- graph1 + theme(plot.title = element_text(hjust = 0.5))
graph1

Nous remarquons que plus la variable duration (durée de l’appel) est grande, plus il y a une probabilité que le client souscrive à un service bancaire.

# Boxplot: age split by subscription outcome (y), rendered interactively
# with plotly. Fix: title read "de le variable" — corrected to "de la".
graph2 <- ggplot(bank_data, aes(x = y, y = age, fill = y)) +
  geom_boxplot()
graph2 <- graph2 +
  ggtitle("L'influence de la variable age sur la variable souscription y")
graph2 <- ggplotly(graph2)
graph2

Il n’y a pas de grande différence entre la distribution d’âge des souscrits (yes) et celle des non souscrits (no). Apparemment, ce n’est pas l’âge qui définit si le client va souscrire à un service bancaire.

# Stacked bar chart: subscription outcome (y) filled by contact channel.
graph3 <- ggplot(bank_data, aes(y, fill = contact)) +
  geom_bar() +
  ggtitle("L'influence de la variable contact sur la variable souscription y")
graph3

Peut-être que la variable contact a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chance que le client ne souscrive pas à un service bancaire s’il répond avec un téléphone mobile (cellular). De même avec un téléphone fixe (telephone).

# Stacked bar chart: subscription outcome (y) filled by marital status.
# Fix: displayed title contained the typo "varible" — corrected to "variable".
graph4 <- ggplot(bank_data, aes(y, fill = marital)) +
  geom_bar()
graph4 <- graph4 +
  ggtitle("L'influence de la variable marital sur la variable souscription y")
graph4

Peut-être que la variable marital a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chance que le client ne souscrive pas à un service bancaire s’il est divorcé.

# Stacked bar chart: subscription outcome (y) filled by housing-loan status.
graph5 <- ggplot(bank_data, aes(y, fill = housing)) +
  geom_bar() +
  ggtitle("L'influence de la variable housing sur la variable souscription y") +
  xlab("y (souscription)")
graph5

Peut-être que la variable housing a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chance que le client ne souscrive pas à un service bancaire s’il n’a pas de logement (housing).

# Stacked bar chart: subscription outcome (y) filled by personal-loan status.
graph6 <- ggplot(bank_data, aes(y, fill = loan)) +
  geom_bar() +
  ggtitle("L'influence de la variable loan sur la variable souscription y") +
  xlab("y (souscription)")
graph6

Peut-être que la variable loan a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chance que le client ne souscrive pas à un service bancaire s’il a fait un prêt bancaire.

Transformation des variables catégorielles en variable numérique

# One-hot encode every categorical column with caret::dummyVars.
dummy_variables <- dummyVars(~., data = bank_data)
dummy_variables_data <- as.data.frame(
  predict(dummy_variables, newdata = bank_data)
)

# Collapse the two target dummies (yno / yyes) back into a single
# human-readable label, then drop the dummies.
dummy_variables_data$Souscription <- ifelse(
  dummy_variables_data$yno == 1, "No", "Yes"
)
dummy_variables_data$yno <- NULL
dummy_variables_data$yyes <- NULL

Création des jeux de données d’entrainement et de test

# Reproducible 70/30 train/test split by row index.
set.seed(3033)
n_obs <- nrow(dummy_variables_data)
training_size <- floor(0.7 * n_obs)
indices <- sample(seq_len(n_obs), size = training_size)
data_bank.train <- dummy_variables_data[indices, ]
data_bank.test <- dummy_variables_data[-indices, ]
dim(data_bank.train)
## [1] 3164   52
dim(data_bank.test)
## [1] 1357   52

Normalisation des données

# Fit centering/scaling parameters on the TRAINING set only (avoids test-set
# leakage), then apply the same transformation to both splits.
data_preprocess_value <- preProcess(
  data_bank.train,
  method = c("center", "scale")
)
data_bank.train.scaled <- predict(data_preprocess_value, data_bank.train)
data_bank.test.scaled <- predict(data_preprocess_value, data_bank.test)

Caret - downsample et upsample

# Class balance of the training target before resampling — the output below
# shows it is heavily skewed toward "No" (2795 vs 369).
table(data_bank.train.scaled[,"Souscription"])
## 
##   No  Yes 
## 2795  369
set.seed(3033)
# Negated %in%, used to select every column except the target.
`%ni%` <- Negate(`%in%`)

# Downsample: shrink the majority class to the minority-class size.
keep_cols <- colnames(data_bank.train.scaled) %ni% "Souscription"
data_bank.train.scaled.downsample <- downSample(
  x = data_bank.train.scaled[, keep_cols],
  y = as.factor(data_bank.train.scaled$Souscription)
)
# downSample names the target "Class"; restore our column name.
names(data_bank.train.scaled.downsample)[
  names(data_bank.train.scaled.downsample) == "Class"
] <- "Souscription"
table(data_bank.train.scaled.downsample[,"Souscription"])
## 
##  No Yes 
## 369 369
# Upsample: replicate minority-class rows until both classes are balanced.
predictors_only <- colnames(data_bank.train.scaled) %ni% "Souscription"
data_bank.train.scaled.upsample <- upSample(
  x = data_bank.train.scaled[, predictors_only],
  y = as.factor(data_bank.train.scaled$Souscription)
)
# upSample names the target "Class"; restore our column name.
names(data_bank.train.scaled.upsample)[
  names(data_bank.train.scaled.upsample) == "Class"
] <- "Souscription"
table(data_bank.train.scaled.upsample[,"Souscription"])
## 
##   No  Yes 
## 2795 2795

Prédiction avec naive bayes

# Naive Bayes on the imbalanced (but scaled) training data,
# evaluated with 10-fold cross-validation repeated 3 times.
set.seed(3033)
trainControl_data <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
naive_bayes_desequilibree <- train(
  Souscription ~ .,
  data = data_bank.train.scaled,
  method = "naive_bayes",
  preProcess = NULL,
  trControl = trainControl_data
)
print(naive_bayes_desequilibree)
## Naive Bayes 
## 
## 3164 samples
##   51 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2848, 2847, 2848, 2848, 2847, 2847, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa       
##   FALSE      0.8298512  0.2903414705
##    TRUE      0.8831645  0.0009392685
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
##  and adjust = 1.
# Score the hold-out set (all columns except the last, which is the target)
# and summarise performance against the true labels.
prediction_naive_bayes_desequilibree <- predict(
  naive_bayes_desequilibree,
  newdata = data_bank.test.scaled[, -ncol(data_bank.test.scaled)]
)
confusionMatrix(
  prediction_naive_bayes_desequilibree,
  as.factor(data_bank.test.scaled[, ncol(data_bank.test.scaled)])
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1205  152
##        Yes    0    0
##                                         
##                Accuracy : 0.888         
##                  95% CI : (0.87, 0.9043)
##     No Information Rate : 0.888         
##     P-Value [Acc > NIR] : 0.5216        
##                                         
##                   Kappa : 0             
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 1.000         
##             Specificity : 0.000         
##          Pos Pred Value : 0.888         
##          Neg Pred Value :   NaN         
##              Prevalence : 0.888         
##          Detection Rate : 0.888         
##    Detection Prevalence : 1.000         
##       Balanced Accuracy : 0.500         
##                                         
##        'Positive' Class : No            
## 

Prédiction avec naive bayes sur les données downsample

# Naive Bayes retrained on the class-balanced (downsampled) training data,
# same repeated 10-fold CV protocol as the imbalanced model.
set.seed(3033)

trainControl_data <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

naive_bayes_downsample <- train(
  Souscription ~ .,
  data = data_bank.train.scaled.downsample,
  method = "naive_bayes",
  preProcess = NULL,
  trControl = trainControl_data
)
print(naive_bayes_downsample)
## Naive Bayes 
## 
## 738 samples
##  51 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 664, 664, 664, 664, 664, 665, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.6882883  0.3765059
##    TRUE      0.6715784  0.3429571
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
##  and adjust = 1.
# Evaluate the downsampled model on the (untouched, imbalanced) test set.
prediction_naive_bayes_downsample <- predict(
  naive_bayes_downsample,
  newdata = data_bank.test.scaled[, -ncol(data_bank.test.scaled)]
)
confusionMatrix(
  prediction_naive_bayes_downsample,
  as.factor(data_bank.test.scaled[, ncol(data_bank.test.scaled)])
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  997  46
##        Yes 208 106
##                                          
##                Accuracy : 0.8128         
##                  95% CI : (0.791, 0.8332)
##     No Information Rate : 0.888          
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.358          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8274         
##             Specificity : 0.6974         
##          Pos Pred Value : 0.9559         
##          Neg Pred Value : 0.3376         
##              Prevalence : 0.8880         
##          Detection Rate : 0.7347         
##    Detection Prevalence : 0.7686         
##       Balanced Accuracy : 0.7624         
##                                          
##        'Positive' Class : No             
## 

Prédiction avec SVM

# Linear-kernel SVM on the imbalanced scaled training data.
set.seed(3033)

trainControl_data <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Fix: trainControl_data was defined but never passed to train(), so caret
# silently fell back to its default bootstrap resampling (the output showed
# "Bootstrapped (25 reps)") instead of the intended repeated 10-fold CV.
SVM_desequilibree <- train(
  Souscription ~ .,
  data = data_bank.train.scaled,
  method = "svmLinear",
  preProcess = NULL,
  trControl = trainControl_data
)
print(SVM_desequilibree)
## Support Vector Machines with Linear Kernel 
## 
## 3164 samples
##   51 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3164, 3164, 3164, 3164, 3164, 3164, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8944894  0.2971418
## 
## Tuning parameter 'C' was held constant at a value of 1
# Evaluate the SVM on the hold-out set (target column excluded from the
# predictors, then used as the reference labels).
prediction_SVM_desequilibree <- predict(
  SVM_desequilibree,
  newdata = data_bank.test.scaled[, -ncol(data_bank.test.scaled)]
)
confusionMatrix(
  prediction_SVM_desequilibree,
  as.factor(data_bank.test.scaled[, ncol(data_bank.test.scaled)])
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1188  122
##        Yes   17   30
##                                           
##                Accuracy : 0.8976          
##                  95% CI : (0.8802, 0.9132)
##     No Information Rate : 0.888           
##     P-Value [Acc > NIR] : 0.1405          
##                                           
##                   Kappa : 0.2625          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9859          
##             Specificity : 0.1974          
##          Pos Pred Value : 0.9069          
##          Neg Pred Value : 0.6383          
##              Prevalence : 0.8880          
##          Detection Rate : 0.8755          
##    Detection Prevalence : 0.9654          
##       Balanced Accuracy : 0.5916          
##                                           
##        'Positive' Class : No              
## 

Les variables prédictives les plus importantes

# ROC-based variable importance for the downsampled naive Bayes model.
# Fix: use FALSE instead of F — F is an ordinary, reassignable binding in R,
# not a reserved word, so spelling it out is the safe idiom.
varImp(naive_bayes_downsample, scale = FALSE)
## ROC curve variable importance
## 
##   only 20 most important variables shown (out of 51)
## 
##                    Importance
## duration               0.8297
## contactcellular        0.6098
## contactunknown         0.6043
## poutcomeunknown        0.5935
## previous               0.5914
## pdays                  0.5889
## poutcomesuccess        0.5732
## campaign               0.5721
## monthmay               0.5637
## housingno              0.5623
## housingyes             0.5623
## balance                0.5559
## maritalmarried         0.5501
## loanno                 0.5393
## loanyes                0.5393
## maritalsingle          0.5298
## monthapr               0.5285
## monthoct               0.5285
## jobmanagement          0.5244
## educationsecondary     0.5230